# -*- coding: utf-8 -*-
# @Time    : 2025/2/11 21:00
# @Author  : 
# @File    : dataset.py
# @Software: PyCharm 
# @Comment : 数据集预处理及构建

import datasets

from config import Config

def qwen_process_verilog_dataset_v3(*, test_size=0.1, seed=None) -> None:
    """Preprocess the raw ``emilgoh/verilog-dataset-v3`` dataset and save it.

    Loads the raw dataset from under ``Config.raw_datasets_base_path``,
    remaps every example to ``instruction`` / ``input`` / ``output`` fields,
    splits the ``train`` split into train and test subsets, and writes the
    result under ``Config.processed_datasets_base_path``.

    Args:
        test_size: Forwarded to ``train_test_split``. Defaults to ``0.1``,
            the value that used to be hard-coded here.
        seed: Forwarded to ``train_test_split``. The default ``None`` leaves
            the shuffle unseeded, exactly as before; pass an integer to get
            the same train/test partition on every run.
    """
    dataset_subpath = "/emilgoh/verilog-dataset-v3"

    def _to_instruction_format(example):
        # The raw ``system_prompt`` column becomes ``instruction`` and the raw
        # ``instruction`` column becomes ``input``; ``output`` is kept as-is.
        return {
            "instruction": example["system_prompt"],
            "input": example["instruction"],
            "output": example["output"],
        }

    raw_dataset = datasets.load_dataset(Config.raw_datasets_base_path + dataset_subpath)

    processed_dataset = raw_dataset.map(
        _to_instruction_format,
        # Drop the original columns of the train split so that only the
        # fields returned by the mapper remain.
        remove_columns=raw_dataset["train"].column_names,
        desc="Preprocessing Qwen-2.5-Coder-7B-Instruct dataset",
    )

    # Split the train split into train/test subsets.
    processed_dataset = processed_dataset["train"].train_test_split(
        test_size=test_size,
        seed=seed,
    )

    # Save the preprocessed dataset (meant to replace any earlier output).
    processed_dataset.save_to_disk(Config.processed_datasets_base_path + dataset_subpath)



if __name__ == "__main__":
    # Script entry point: rebuild the processed dataset, then read it back
    # from disk to confirm what was written.
    qwen_process_verilog_dataset_v3()

    processed_path = Config.processed_datasets_base_path + "/emilgoh/verilog-dataset-v3"
    dataset = datasets.load_from_disk(processed_path)

    # Print the dataset summary.
    print(dataset)

